In [10]:
%matplotlib inline 
#above allows plots to display inline in the notebook. 

#python includes
import sys

#standard probability includes:
import numpy as np #matrices and data structures
import scipy.stats as ss #standard statistical operations
import pandas as pd #keeps data organized in labeled tables
import matplotlib
import matplotlib.pyplot as plt #plot visualization
In [11]:
#Method to load data
def getConllTags(filename):
    #input: filename of a CoNLL-style part-of-speech tagged file
    #        (one "word<TAB>tag" per line; blank lines separate sentences)
    #output: a list of sentences, each a list of (word, tag) tuples:
    #        [[(word1, tag1), (word2, tag2), ...], ...]
    wordTagsPerSent = [[]]
    sentNum = 0
    with open(filename, encoding='utf8') as f:
        for wordtag in f: 
            wordtag=wordtag.strip()
            if wordtag:#still reading current sentence
                (word, tag) = wordtag.split("\t")
                wordTagsPerSent[sentNum].append((word,tag))
            else:#new sentence
                wordTagsPerSent.append([])
                sentNum+=1
    return wordTagsPerSent  
In [12]:
corpus = 'daily547.conll'
taggedSents = getConllTags(corpus)
print(taggedSents[:2])
[[('Daaammmnn', '!'), ('.', ','), ('Florida', '^'), ('got', 'V'), ('too', 'R'), ('many', 'A'), ('tolls', 'N'), ('..', ','), ('Coach', 'N'), ('comin', 'V'), ('outta', 'P'), ('pocket', 'N'), ('every', 'D'), ('5', '$'), ('mins', 'N'), ('-__-', 'E')], [('RT', '~'), ('@TheBlissfulChef', '@'), (':', '~'), ('Just', 'R'), ('heard', 'V'), ('vegetables', 'N'), ('will', 'V'), ('be', 'V'), ('the', 'D'), ('new', 'A'), ('meat', 'N'), ('in', 'P'), ('2011', '$'), ('.', ','), ('Woot', '!'), ('Woot', '!'), ('!', ',')]]
In [13]:
from pprint import pprint
#keep track of counts here:
wordCounts = dict()
bigramCounts = dict()
trigramCounts = dict()
numTrainingSents = 500

#iterate through each sentence, and extract word and bigram counts
for sent in taggedSents[:numTrainingSents]: 
    words = [word.lower() for word, tag in sent] # grabbing words, dropping tags
    #print("\nNext Sent:", words)
    for i in range(len(words)):
        try: 
            wordCounts[(words[i],)] += 1
        except KeyError:
            wordCounts[(words[i],)] = 1
            
        #count the bigram
        if (i > 0):
            bigram = (words[i-1],words[i])
            try: 
                bigramCounts[bigram] += 1
            except KeyError:
                bigramCounts[bigram] = 1
                
                
        #count the trigram
        if (i > 1):
            trigram = (words[i-2], words[i-1], words[i])
            try: 
                trigramCounts[trigram] += 1
            except KeyError:
                trigramCounts[trigram] = 1

pprint(sorted(wordCounts.items(), key=lambda kv: kv[1], reverse=True)[:20])
pprint(sorted(bigramCounts.items(), key=lambda kv: kv[1], reverse=True)[:20])
pprint(sorted(trigramCounts.items(), key=lambda kv: kv[1], reverse=True)[:20])
    
        
[(('.',), 229),
 (('i',), 166),
 ((':',), 143),
 ((',',), 141),
 (('the',), 120),
 (('you',), 119),
 (('rt',), 110),
 (('to',), 107),
 (('a',), 94),
 (('!',), 92),
 (('...',), 80),
 (('"',), 76),
 (('in',), 75),
 (('and',), 70),
 (('is',), 60),
 (('my',), 58),
 (('for',), 50),
 (('that',), 48),
 (('it',), 47),
 (('of',), 46)]
[(('in', 'the'), 16),
 ((':', 'i'), 12),
 ((',', 'but'), 9),
 (('i', "don't"), 9),
 (('.', 'i'), 9),
 (('i', 'think'), 8),
 (('we', 'are'), 8),
 (('if', 'you'), 8),
 ((',', 'i'), 8),
 (('will', 'be'), 7),
 (('i', 'have'), 7),
 (('.', '"'), 7),
 (('my', 'boo'), 7),
 ((':', 'the'), 7),
 (('be', 'the'), 6),
 (('in', 'my'), 6),
 (('how', 'to'), 6),
 (('i', 'love'), 6),
 (('love', 'you'), 6),
 (('i', 'just'), 6)]
[(('like', 'a', 'bum'), 4),
 (('i', 'think', 'i'), 3),
 ((',', 'but', 'i'), 3),
 (("don't", 'know', 'how'), 3),
 (('i', 'love', 'you'), 3),
 (('do', 'what', 'you'), 3),
 (('.', '"', '-'), 3),
 (('"', 'no', 'evil'), 3),
 (('.', 'my', 'boo'), 3),
 (('my', 'boo', '.'), 3),
 (('i', 'will', 'be'), 3),
 (('a', 'bum', ','), 3),
 (('bum', ',', 'i'), 3),
 (('will', 'be', 'the'), 2),
 (('know', 'how', 'to'), 2),
 (('walk', 'in', 'heels'), 2),
 (('about', 'to', 'go'), 2),
 (('i', "don't", 'think'), 2),
 (('think', 'it', 'was'), 2),
 (("don't", 'say', 'that'), 2)]
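In [ ]:
#the same counts can be computed more compactly with collections.Counter; this
#is an equivalent sketch for comparison (wordCounts2 etc. are new names
#introduced here), not part of the original flow:
from collections import Counter

wordCounts2, bigramCounts2, trigramCounts2 = Counter(), Counter(), Counter()
for sent in taggedSents[:numTrainingSents]:
    words = [word.lower() for word, tag in sent]
    wordCounts2.update((w,) for w in words) #unigrams as 1-tuples, matching wordCounts
    bigramCounts2.update(zip(words, words[1:])) #adjacent pairs
    trigramCounts2.update(zip(words, words[1:], words[2:])) #adjacent triples

#sanity check: the Counter version matches the explicit loop above
assert dict(wordCounts2) == wordCounts and dict(trigramCounts2) == trigramCounts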
In [18]:
#specify the model (e.g. bigramCounts or trigramCounts); contextCounts must
#hold the counts of the (n-1)-gram context used in the denominator:
ngramCounts = trigramCounts
contextCounts = bigramCounts
#ngramCounts = bigramCounts
#contextCounts = wordCounts

ngramModelProbs = dict() #stores p(xi|x_{i-k}...x_{i-1}), indexed by [x_{i-k}...x_{i-1}][xi]
for ngram, count in ngramCounts.items():
    p = count / contextCounts[ngram[0:-1]] #maximum-likelihood estimate: count(ngram)/count(context)
    try: 
        ngramModelProbs[ngram[0:-1]][ngram[-1]] = p #indexed by [x_{i-k}...x_{i-1}][xi]
    except KeyError:
        ngramModelProbs[ngram[0:-1]] = {ngram[-1]: p}

pprint(sorted(ngramModelProbs[('i','love')].items()))#show probabilities for all words that could follow "i love"
#pprint(sorted(ngramModelProbs[('i',)].items()))#show probabilities for all words that could follow "i" (bigram model)
[('makeup', 0.16666666666666666),
 ('my', 0.16666666666666666),
 ('when', 0.16666666666666666),
 ('you', 0.5)]
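In [ ]:
#sanity check on the maximum-likelihood estimate: from the counts printed
#earlier, p('you' | 'i','love') = trigramCounts[('i','love','you')] /
#bigramCounts[('i','love')] = 3/6 = 0.5, matching the output above
print(trigramCounts[('i','love','you')] / bigramCounts[('i','love')])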
In [ ]:
#if time: generate a sentence (a sketch follows)
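#a minimal sketch of sampling from the trigram model built in In [18]
#(ngramModelProbs); the seed context, max length, and stopping rule are
#assumptions, not from the original notebook:
def generateSent(model, seed=('i', 'love'), maxTokens=20):
    #input: model indexed [context][next_word] -> p; seed must be a context in model
    #output: a list of generated tokens, beginning with the seed
    tokens = list(seed)
    context = tuple(seed)
    for _ in range(maxTokens):
        if context not in model: #unseen context: no smoothing or backoff here, so stop
            break
        candidates = sorted(model[context].items())
        nextWord = np.random.choice([w for w, p in candidates],
                                    p=[p for w, p in candidates])
        tokens.append(nextWord)
        context = context[1:] + (nextWord,) #slide the context window forward one word
        if nextWord == '.': #treat a period as the end of the sentence
            break
    return tokens

print(' '.join(generateSent(ngramModelProbs)))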
In [ ]:
#saved code in case we want to do a one-hot representation

wordToIndex = set() #vocabulary: all words seen in the corpus
tagToNum = set() #all tags seen in the corpus
for sent in taggedSents:
    if sent: 
        words, tags = zip(*sent)
        wordToIndex |= set(words) #union of the words into the set
        tagToNum |= set(tags) #union of all the tags into the set
print("[Read ", len(taggedSents), " Sentences]")
#make dictionaries for converting words to indices and tags to ids:
wordToIndex = {w: i for i, w in enumerate(wordToIndex)} 
numToTag = list(tagToNum) #mapping index to tag
tagToNum = {numToTag[i]: i for i in range(len(numToTag))}
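In [ ]:
#a small sketch of how the mappings above could be used to build a one-hot
#vector for a word; oneHot is a new helper introduced here for illustration,
#not part of the original notebook:
def oneHot(word, wordToIndex):
    vec = np.zeros(len(wordToIndex)) #all zeros except at the word's own index
    vec[wordToIndex[word]] = 1.0
    return vec

someWord = next(iter(wordToIndex)) #any word from the vocabulary
print(someWord, oneHot(someWord, wordToIndex)[:10]) #first 10 dimensions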